home *** CD-ROM | disk | FTP | other *** search
/ Enter 2001 August / EnterCD8.iso / Internet / HTTrack Website Copier / httrack.exe / {app} / src / htsparse.c < prev    next >
Encoding:
C/C++ Source or Header  |  2001-06-03  |  92.0 KB  |  1,978 lines

  1. /* ------------------------------------------------------------ */
  2. /*
  3. HTTrack Website Copier, Offline Browser for Windows and Unix
  4. Copyright (C) Xavier Roche and other contributors
  5.  
  6. This program is free software; you can redistribute it and/or
  7. modify it under the terms of the GNU General Public License
  8. as published by the Free Software Foundation; either version 2
  9. of the License, or any later version.
  10.  
  11. This program is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14. GNU General Public License for more details.
  15.  
  16. You should have received a copy of the GNU General Public License
  17. along with this program; if not, write to the Free Software
  18. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  19.  
  20.  
  21. Important notes:
  22.  
  23. - We hereby ask people using this source NOT to use it in purpose of grabbing
  24. emails addresses, or collecting any other private information on persons.
  25. This would disgrace our work, and spoil the many hours we spent on it.
  26.  
  27.  
  28. Please visit our Website: http://www.httrack.com
  29. */
  30.  
  31.  
  32. /* ------------------------------------------------------------ */
  33. /* File: Main source                                            */
  34. /* DIRECT INCLUDE TO httrack.c                                  */
  35. /* Author: Xavier Roche                                         */
  36. /* ------------------------------------------------------------ */
  37.  
  38.  
  39. #if HTS_ANALYSTE
  40. if (hts_htmlcheck(r.adr,(int)r.size,urladr,urlfil)) {
  41. #endif          
  42.   FILE* fp=NULL;      // fichier Θcrit localement                                               // et si level>0
  43.   char* adr=r.adr;    // pointeur (on parcourt)
  44.   char* lastsaved;    // adresse du dernier octet sauvΘ + 1
  45.   if ( (opt.debug>1) && (opt.log!=NULL) ) {
  46.     fspc(opt.log,"debug"); fprintf(opt.log,"scan file.."LF); test_flush;
  47.   }
  48.  
  49.  
  50.   // Indexing!
  51. #if HTS_MAKE_KEYWORD_INDEX
  52.   if (opt.kindex) {
  53.     if (index_keyword(r.adr,r.size,r.contenttype,savename,opt.path_html)) {
  54.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  55.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..done"LF); test_flush;
  56.       }
  57.     } else {
  58.       if ( (opt.debug>1) && (opt.log!=NULL) ) {
  59.         fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..error!"LF); test_flush;
  60.       }
  61.     }
  62.   }
  63. #endif
  64.  
  65.   // Now, parsing
  66.   if ((opt.getmode & 1) && (ptr>0)) {  // rΘcupΘrer les html sur disque       
  67.     // crΘer le fichier html local
  68.     HT_ADD_FOP;   // Θcrire peu α peu le fichier
  69.   }
  70.   
  71.   if (!error) {
  72.     int detect_title=0;  // dΘtection  du title
  73.     //
  74.     char* in_media=NULL; // in other media type (real media and so..)
  75.     int intag=0;         // on est dans un tag
  76.     int incomment=0;     // dans un <!--
  77.     int inscript=0;      // dans un scipt pour applets javascript)
  78.     int inscript_tag=0;  // on est dans un <body onLoad="... terminΘ par >
  79.     char inscript_tag_lastc='\0';  
  80.                            // terminaison (" ou ') du "<body onLoad=.."
  81.     int inscriptgen=0;     // on est dans un code gΘnΘrant, ex aprΦs obj.write("..
  82.     char scriptgen_q='\0'; // caractΦre faisant office de guillemet (' ou ")
  83.     int nofollow=0;        // ne pas scanner
  84.     //
  85.     int parseall_lastc='\0';    // dernier caractΦre parsΘ pour parseall
  86.     int parseall_incomment=0;   // dans un /* */ (exemple: a = /* URL */ "img.gif";)
  87.     //
  88.     char* intag_start=adr;
  89.     int intag_start_valid=0;
  90.     HT_ADD_START;    // dΘbuter
  91.  
  92.  
  93.     /* statistics */
  94.     if ((opt.getmode & 1) && (ptr>0)) { 
  95.       HTS_STAT.stat_files++;
  96.       HTS_STAT.stat_bytes+=r.size;
  97.     }
  98.  
  99.  
  100.     /* Check if the file is a .js file */
  101.     if (strfield2(r.contenttype,"application/x-javascript")!=0) {      /* JavaScript js file */
  102.       inscript=1;
  103.       intag=1;     // because aprΦs <script> on y est .. - pas utile
  104.       intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  105.       if ((opt.debug>1) && (opt.log!=NULL)) {
  106.         fspc(opt.log,"debug"); fprintf(opt.log,"note: this file is a javascript file"LF); test_flush;
  107.       }
  108.     } else if (strfield2(r.contenttype,"audio/x-pn-realaudio")!=0) {      /* realaudio link file */
  109.       inscript=intag=1;
  110.       intag_start_valid=0;
  111.       in_media="RAM";       // real media!
  112.     }
  113.     // Hack to prevent any problems with ram files or other files
  114.     * ( r.adr + r.size ) = '\0';
  115.  
  116.  
  117.     // ------------------------------------------------------------
  118.     // analyser ce qu'il y a en mémoire (fichier html)
  119.     // on scanne les balises
  120.     // ------------------------------------------------------------
  121. #if HTS_ANALYSTE==2
  122.     _hts_in_html_done=0;     // 0% scannΘs
  123.     _hts_cancel=0;           // pas de cancel
  124.     _hts_in_html_parsing=1;  // flag pour indiquer un parsing
  125. #endif
  126.     base[0]='\0';    // effacer base-href
  127.     lastsaved=adr;
  128.     do {
  129.       int p=0;
  130.       int valid_p=0;      // force to take p even if == 0
  131.       error=0;
  132.  
  133.       /* Hack to avoid NULL char problems with C syntax */
  134.       /* Yes, some bogus HTML pages can embed null chars
  135.          and therefore can not be properly handled if this hack is not done
  136.       */
  137.       if ( ! (*adr) ) {
  138.         if (( ((int) adr) - ((int) r.adr) ) < r.size)
  139.           *adr=' ';
  140.       }
  141.  
  142.  
  143.  
  144.       /*
  145.       index.html built here
  146.       */
  147.       // Construction index.html (sommaire)
  148.       // Avant de tester les a href,
  149.       // Ici on teste si l'on doit construire l'index vers le(s) site(s) miroir(s)
  150.       if (!makeindex_done) {  // autoriation d'Θcrire un index
  151.         if (!detect_title) {
  152.           if (opt.depth == liens[ptr]->depth) {    // on note toujours les premiers liens
  153.             if (!in_media) {
  154.               if (opt.makeindex && (ptr>0)) {
  155.                 if (opt.getmode & 1) {  // autorisation d'Θcrire
  156.                   p=strfield(adr,"title");  
  157.                   if (p) {
  158.                     if (*(adr-1)=='/') p=0;    // /title
  159.                   } else {
  160.                     if (strfield(adr,"/html"))
  161.                       p=-1;                    // noter, mais sans titre
  162.                     else if (strfield(adr,"body"))
  163.                       p=-1;                    // noter, mais sans titre
  164.                     else if (( ((int) adr) - ((int) r.adr) ) >= (r.size-1) )
  165.                       p=-1;                    // noter, mais sans titre
  166.                   }
  167.                 } else
  168.                   p=0;
  169.                 
  170.                 if (p) {    // ok center                            
  171.                   if (makeindex_fp==NULL) {
  172.                     verif_backblue(opt.path_html);    // gΘnΘrer gif
  173.                     makeindex_fp=filecreate(fconcat(opt.path_html,"index.html"));
  174.                     if (makeindex_fp!=NULL) {
  175.                       fprintf(makeindex_fp,"<HTML>"CRLF);
  176.                       fprintf(makeindex_fp,"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"CRLF);
  177.                       fprintf(makeindex_fp,"<HEAD>"CRLF"<TITLE>");
  178.                       fprintf(makeindex_fp,"Local index");
  179.                       fprintf(makeindex_fp,"</TITLE>"CRLF"</HEAD>"CRLF"<BODY BACKGROUND=\"backblue.gif\"><H1 ALIGN=Center>");
  180.                       fprintf(makeindex_fp,"<U>Index of locally available sites:</U>"CRLF"</H1>"CRLF"<BR><BR>"CRLF);
  181.                       fprintf(makeindex_fp,"<TABLE BORDER=\"0\" WIDTH=\"100%%\" CELLSPACING=\"1\" CELLPADDING=\"0\">"CRLF);
  182.                       fprintf(makeindex_fp,"<UL>"CRLF);
  183.                     } else makeindex_done=-1;    // fait, erreur
  184.                   }
  185.                   
  186.                   if (makeindex_fp!=NULL) {
  187.                     char tempo[HTS_URLMAXSIZE*2];
  188.                     char s[HTS_URLMAXSIZE*2];
  189.                     char* a=NULL;
  190.                     char* b=NULL;
  191.                     s[0]='\0';
  192.                     if (p>0) {
  193.                       a=strchr(adr,'>');
  194.                       if (a!=NULL) {
  195.                         a++;
  196.                         while(is_space(*a)) a++;    // sauter espaces & co
  197.                         b=strchr(a,'<');   // prochain tag
  198.                       }
  199.                     }
  200.                     if (lienrelatif(tempo,liens[ptr]->sav,concat(opt.path_html,"index.html"))==0) {
  201.                       detect_title=1;      // ok dΘtectΘ pour cette page!
  202.                       makeindex_links++;   // un de plus
  203.                       strcpy(makeindex_firstlink,tempo);
  204.                       //
  205.                       fprintf(makeindex_fp,"<TR>"CRLF"<TD BACKGROUND=\"fade.gif\">"CRLF);
  206.                       //fprintf(makeindex_fp,"<UL>"CRLF);
  207.                       fprintf(makeindex_fp,"<LI>"CRLF);
  208.                       if ((b==a) || (a==NULL) || (b==NULL)) {    // pas de titre
  209.                         fprintf(makeindex_fp,"<A HREF=\"%s\">%s</A><BR>"CRLF,tempo,tempo);
  210.                       } else if ((b-a)<256) {
  211.                         b--;
  212.                         while(is_space(*b)) b--;
  213.                         strncpy(s,a,b-a+1);
  214.                         *(s+(b-a)+1)='\0';
  215.                         fprintf(makeindex_fp,"<A HREF=\"%s\">%s</A><BR>"CRLF,tempo,s);
  216.                       }
  217.                       fprintf(makeindex_fp,"</LI>"CRLF);
  218.                       //fprintf(makeindex_fp,"</UL>"CRLF);
  219.                       fprintf(makeindex_fp,"</TD>"CRLF"</TR>"CRLF);
  220.                     }
  221.                   }
  222.                 }
  223.                 
  224.               } else if (liens[ptr]->depth<opt.depth) {   // on a sautΘ level1+1 et level1
  225.                 if (makeindex_fp) {
  226.                   fprintf(makeindex_fp,"</UL>"CRLF);
  227.                   fprintf(makeindex_fp,"</TABLE>"CRLF);
  228.                   fprintf(makeindex_fp,"<BR>"CRLF"<BR>"CRLF"<BR>"CRLF"</BODY>"CRLF);
  229.                   fprintf(makeindex_fp,"<I><H6 ALIGN=\"RIGHT\">Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS"</H6></I>"CRLF);
  230.                   fprintf(makeindex_fp,"<!-- Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"CRLF);
  231.                   fprintf(makeindex_fp,"<!-- Thanks for using HTTrack Website Copier! -->"CRLF);
  232.                   if (makeindex_links == 1) {
  233.                     fprintf(makeindex_fp,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,makeindex_firstlink);
  234.                   }
  235.                   fprintf(makeindex_fp,"</HTML>"CRLF);
  236.                   fflush(makeindex_fp);
  237.                   fclose(makeindex_fp);  // α ne pas oublier sinon on passe une nuit blanche
  238.                   makeindex_fp=NULL;
  239.                   usercommand(0,NULL,fconcat(opt.path_html,"index.html"));                            
  240.                 }
  241.                 makeindex_done=1;    // ok c'est fait
  242.               }
  243.             }
  244.           }
  245.         } // if (opt.makeindex)
  246.       }
  247.       // FIN Construction index.html (sommaire)
  248.       /*
  249.       end -- index.html built here
  250.       */
  251.       
  252.  
  253.  
  254.       /* Parse */
  255.       if (
  256.            (*adr=='<')    /* No starting tag */
  257.         && (!inscript)    /* Not in (java)script */
  258.         && (!incomment)   /* Not in comment (<!--) */
  259.       ) { 
  260.         intag=1;
  261.         parseall_incomment=0;
  262.         //inquote=0;  // effacer quote
  263.         intag_start=adr; intag_start_valid=1;
  264.         codebase[0]='\0';    // effacer Θventuel codebase
  265.         
  266.         if (opt.getmode & 1) {  // sauver html
  267.           p=strfield(adr,"</html");
  268.           if (p==0) p=strfield(adr,"<head>");
  269.           if (p) {
  270.             if (strnotempty(opt.footer)) {
  271.               char tempo[1024+HTS_URLMAXSIZE*2];
  272.               char gmttime[256];
  273.               tempo[0]='\0';
  274.               time_gmt_rfc822(gmttime);
  275.               strcpy(tempo,CRLF);
  276.               sprintf(tempo+strlen(tempo),opt.footer,jump_identification(urladr),urlfil,gmttime,"","","","","","","","");
  277.               strcat(tempo,CRLF);
  278.               //fwrite(tempo,1,strlen(tempo),fp);
  279.               HT_ADD(tempo);
  280.             }
  281.           }
  282.         }        
  283.         
  284.         // éliminer les <!-- (commentaires) : intag dévalidé
  285.         if (*(adr+1)=='!')
  286.           if (*(adr+2)=='-')
  287.             if (*(adr+3)=='-') {
  288.               intag=0;
  289.               incomment=1;
  290.               intag_start_valid=0;
  291.             }
  292.             
  293.       }
  294.       else if (
  295.            (*adr=='>')                        /* ending tag */
  296.         && ( (!inscript) || (inscript_tag) )  /* and in tag (or in script) */
  297.       ) {
  298.         if (inscript_tag) {
  299.           inscript_tag=inscript=0;
  300.           intag=0;
  301.           incomment=0;
  302.           intag_start_valid=0;
  303.         } else if (!incomment) {
  304.           intag=0; //inquote=0;
  305.           
  306.           // entrée dans du javascript?
  307.           // on parse ICI car il se peut qu'on ait eu à parser les src=.. dedans
  308.           //if (!inscript) {  // sinon on est dans un obj.write("..
  309.           if ((intag_start_valid) && check_tag(intag_start,"script")) {
  310.             char* a=intag_start;    // <
  311.             // ** while(is_realspace(*(--a)));
  312.             if (*a=='<') {  // s√r que c'est un tag?
  313.               inscript=1;
  314.               intag=1;     // because aprΦs <script> on y est .. - pas utile
  315.               intag_start_valid=0;    // OUI car nous sommes dans du code, plus dans du "vrai" tag
  316.             }
  317.           }
  318.         } else {                               /* end of comment? */
  319.           // vérifier fermeture correcte
  320.           if ( (*(adr-1)=='-') && (*(adr-2)=='-') ) {
  321.             intag=0;
  322.             incomment=0;
  323.             intag_start_valid=0;
  324.           }
  325. #if GT_ENDS_COMMENT
  326.           /* wrong comment ending */
  327.           else {
  328.             /* check if correct ending do not exist 
  329.                <!-- foo > example <!-- bar > is sometimes accepted by browsers
  330.                when no --> is used somewhere else.. darn those browsers are dirty
  331.             */
  332.             if (!strstr(adr,"-->")) {
  333.               intag=0;
  334.               incomment=0;
  335.               intag_start_valid=0;
  336.             }
  337.           }
  338. #endif
  339.         }
  340.         //}
  341.       }
  342.       //else if (*adr==34) {
  343.       //  inquote=(inquote?0:1);
  344.       //}
  345.       else if (intag || inscript) {    // nous sommes dans un tag/commentaire, tester si on recoit un tag
  346.         int p_type=0;
  347.         int p_nocatch=0;
  348.         int p_searchMETAURL=0;  // chercher ..URL=<url>
  349.         int add_class=0;        // ajouter .class
  350.         char* p_flush=NULL;
  351.         
  352.         
  353.         // ------------------------------------------------------------
  354.         // parsing évolué
  355.         // ------------------------------------------------------------
  356.         if (((isalpha((unsigned char)*adr)) || (*adr=='/') || (inscript) || (inscriptgen))) {  // sinon pas la peine de tester..
  357.  
  358.  
  359.           /* caractère de terminaison pour "miniparsing" javascript=.. ? 
  360.              (ex: <a href="javascript:()" action="foo"> ) */
  361.           if (inscript_tag) {
  362.             if (inscript_tag_lastc) {
  363.               if (*adr == inscript_tag_lastc) {
  364.                 /* sortir */
  365.                 inscript_tag=inscript=0;
  366.                 incomment=0;
  367.               }
  368.             }
  369.           }
  370.           
  371.           
  372.           // Note:
  373.           // Certaines pages ne respectent pas le html
  374.           // notamment les guillemets ne sont pas fixés
  375.           // Nous sommes dans un tag, donc on peut faire un test plus
  376.           // large pour pouvoir prendre en compte ces particularités
  377.           
  378.           // à vérifier: ACTION, CODEBASE, VRML
  379.           
  380.           if (in_media) {
  381.             if (strcmp(in_media,"RAM")==0) { // real media
  382.               p=0;
  383.               valid_p=1;
  384.             }
  385.           } else if (ptr>0) {        /* pas premiΦre page 0 (primary) */
  386.             p=0;  // saut pour le nom de fichier: adresse nom fichier=adr+p
  387.             
  388.             // ------------------------------
  389.             // détection d'écriture JavaScript.
  390.             // osons les obj.write et les obj.href=.. ! osons!
  391.             // note: inscript==1 donc on sautera après les \"
  392.             if (inscript) {
  393.               if (inscriptgen) {          // on est dΘja dans un objet gΘnΘrant..
  394.                 if (*adr==scriptgen_q) {  // fermeture des " ou '
  395.                   if (*(adr-1)!='\\') {   // non
  396.                     inscriptgen=0;        // ok parsing terminΘ
  397.                   }
  398.                 }
  399.               } else {
  400.                 char* a=NULL;
  401.                 char check_this_fking_line=0;  // parsing code javascript..
  402.                 char must_be_terminated=0;     // caractΦre obligatoire de terminaison!
  403.                 int token_size;
  404.                 if (!(token_size=strfield(adr,".writeln"))) // dΘtection ...objet.write[ln]("code html")...
  405.                   token_size=strfield(adr,".write");
  406.                 if (token_size) {
  407.                   a=adr+token_size;
  408.                   while(is_realspace(*a)) a++; // sauter espaces
  409.                   if (*a=='(') {  // dΘbut parenthΦse
  410.                     check_this_fking_line=2;  // α parser!
  411.                     must_be_terminated=')';
  412.                     a++;  // sauter (
  413.                   }
  414.                 }
  415.                 // euhh ??? ???
  416.                 /* else if (strfield(adr,".href")) {  // dΘtection ...objet.href="...
  417.                 a=adr+5;
  418.                 while(is_realspace(*a)) a++; // sauter espaces
  419.                 if (*a=='=') {  // ohh un Θgal
  420.                 check_this_fking_line=1;  // α noter!
  421.                 must_be_terminated=';';   // et si t'as oubliΘ le ; tu sais pas coder
  422.                 a++;   // sauter =
  423.                 }
  424.                 
  425.               }*/
  426.                 
  427.                 // on a un truc du genre instruction"code gΘnΘrΘ" dont on parse le code
  428.                 if (check_this_fking_line) {
  429.                   while(is_realspace(*a)) a++;
  430.                   if ((*a=='\'') || (*a=='"')) {  // dΘpart de '' ou ""
  431.                     char *b;
  432.                     int ex=0;
  433.                     scriptgen_q=*a;    // quote
  434.                     b=a+1;      // dΘpart de la chaεne
  435.                     // vΘrifier forme ("code") et pas ("code"+var), ingΘrable
  436.                     do {
  437.                       a++;  // caractΦre suivant
  438.                       if (*a==scriptgen_q) if (*(a-1)!='\\')  // quote non slash
  439.                         ex=1;            // sortie
  440.                       if ((*a==10) || (*a==13))
  441.                         ex=1;
  442.                     } while(!ex);
  443.                     if (*a==scriptgen_q) {  // fin du quote
  444.                       a++;
  445.                       while(is_realspace(*a)) a++;
  446.                       if (*a==must_be_terminated) {  // parenthΦse fermante: ("..")
  447.                         
  448.                         // bon, on doit parser une ligne javascript
  449.                         // 1) si check.. ==1 alors c'est un nom de fichier direct, donc
  450.                         // on fixe p sur le saut nécessaire pour atteindre le nom du fichier
  451.                         // et le moteur se débrouillera ensuite tout seul comme un grand
  452.                         // 2) si check==2 c'est un peu plus tordu car là on génère du
  453.                         // code html au sein de code javascript au sein de code html
  454.                         // dans ce cas on doit fixer un flag à un puis ensuite dans la boucle
  455.                         // on devra parser les instructions standard comme <a href etc
  456.                         // NOTE: le code javascript autogénéré n'est pas pris en compte!!
  457.                         // (et ne marche pas dans 50% des cas de toute façon!)
  458.                         if (check_this_fking_line==1) {
  459.                           p=(int) b-(int) adr;  // calculer saut!
  460.                         } else {
  461.                           inscriptgen=1;        // SCRIPTGEN actif
  462.                           adr=b;                // jump
  463.                         }
  464.                         
  465.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  466.                           char str[512];
  467.                           str[0]='\0';
  468.                           strncat(str,b,minimum((int) a-(int) b+1,32));
  469.                           fspc(opt.log,"debug"); fprintf(opt.log,"active code (%s) detected in javascript: %s"LF,(check_this_fking_line==2)?"parse":"pickup",str); test_flush;
  470.                         }
  471.                       }
  472.                       
  473.                     }
  474.                     
  475.                   }
  476.                   
  477.                   
  478.                 }
  479.               }
  480.             }
  481.             // fin détection code générant javascript vers html
  482.             // ------------------------------
  483.             
  484.             
  485.             // analyse proprement dite, A HREF=.. etc..
  486.             if (!p) {
  487.               // si dans un tag, et pas dans un script - sauf si on analyse un obj.write("..
  488.               if ((intag && (!inscript)) || inscriptgen) {
  489.                 if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) {   // <tag < tag etc
  490.                   // <A HREF=.. pour les liens HTML
  491.                   p=rech_tageq(adr,"href");
  492.                   if (p) {    // href.. tester si c'est une bas href!
  493.                     if ((intag_start_valid) && check_tag(intag_start,"base")) {  // oui!
  494.                       // ** note: base href et codebase ne font pas bon mΘnage..
  495.                       p_type=2;    // c'est un chemin
  496.                     }
  497.                   }
  498.                   
  499.                   /* Tags supplémentaires à vérifier (<img src=..> etc) */
  500.                   if (p==0) {
  501.                     int i=0;
  502.                     while( (p==0) && (strnotempty(hts_detect[i])) ) {
  503.                       p=rech_tageq(adr,hts_detect[i]);
  504.                       i++;
  505.                     }
  506.                   }
  507.  
  508.                   /* Tags supplémentaires à vérifier : URL=.. */
  509.                   if (p==0) {
  510.                     int i=0;
  511.                     while( (p==0) && (strnotempty(hts_detectURL[i])) ) {
  512.                       p=rech_tageq(adr,hts_detectURL[i]);
  513.                       i++;
  514.                     }
  515.                     if (p)
  516.                       p_searchMETAURL=1;
  517.                   }
  518.  
  519.                   /* Tags supplémentaires à vérifier, mais à ne pas capturer */
  520.                   if (p==0) {
  521.                     int i=0;
  522.                     while( (p==0) && (strnotempty(hts_detectandleave[i])) ) {
  523.                       p=rech_tageq(adr,hts_detectandleave[i]);
  524.                       i++;
  525.                     }
  526.                     if (p)
  527.                       p_nocatch=1;      /* ne pas rechercher */
  528.                   }
  529.  
  530.                   /* EvΘnements */
  531.                   if (p==0) {
  532.                     int i=0;
  533.                     /* dΘtection onLoad etc */
  534.                     while( (p==0) && (strnotempty(hts_detect_js[i])) ) {
  535.                       p=rech_tageq(adr,hts_detect_js[i]);
  536.                       i++;
  537.                     }
  538.                     /* non dΘtectΘ - dΘtecter Θgalement les onXxxxx= */
  539.                     if (p==0) {
  540.                       if ( (*adr=='o') && (*(adr+1)=='n') && isUpperLetter(*(adr+2)) ) {
  541.                         p=0;
  542.                         while(isalpha((unsigned char)adr[p]) && (p<64) ) p++;
  543.                         if (p<64) {
  544.                           while(is_space(adr[p])) p++;
  545.                           if (adr[p]=='=')
  546.                             p++;
  547.                           else p=0;
  548.                         } else p=0;
  549.                       }
  550.                     }
  551.                     /* OK, ΘvΘnement repΘrΘ */
  552.                     if (p) {
  553.                       inscript_tag_lastc=*(adr+p-1);     /* α attendre α la fin */
  554.                       adr+=p;     /* saut */
  555.                                   /*
  556.                                   On est dΘsormais dans du code javascript
  557.                       */
  558.                       inscript_tag=inscript=1;
  559.                     }
  560.                     p=0;        /* quoi qu'il arrive, ne rien dΘmarrer ici */
  561.                   }
  562.  
  563.                   // <APPLET CODE=.. pour les applet java.. [CODEBASE (chemin..) α faire]
  564.                   if (p==0) {
  565.                     p=rech_tageq(adr,"code");
  566.                     if (p) {
  567.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  568.                         p_type=-1;  // juste le nom de fichier+dossier, Θcire avant codebase 
  569.                         add_class=1;   // ajouter .class au besoin                         
  570.                         
  571.                         // vérifier qu'il n'y a pas de codebase APRES
  572.                         // sinon on swappe les deux.
  573.                         // pas très propre mais c'est ce qu'il y a de plus simple à faire!!
  574.                         
  575.                         {
  576.                           char *a;
  577.                           a=adr;
  578.                           while((*a) && (*a!='>') && (!rech_tageq(a,"codebase"))) a++;
  579.                           if (rech_tageq(a,"codebase")) {  // banzai! codebase=
  580.                             char* b;
  581.                             b=strchr(a,'>');
  582.                             if (b) {
  583.                               if (((int) b - (int) adr) < 1000) {    // au total < 1Ko
  584.                                 char tempo[HTS_URLMAXSIZE*2];
  585.                                 tempo[0]='\0';
  586.                                 strncat(tempo,a,(int) b - (int) a);
  587.                                 strcat( tempo," ");
  588.                                 strncat(tempo,adr,(int) a - (int) adr - 1);
  589.                                 // éventuellement remplir par des espaces pour avoir juste la taille
  590.                                 while((int) strlen(tempo)<((int) b - (int) adr))
  591.                                   strcat(tempo," ");
  592.                                 // pas d'erreur?
  593.                                 if ((int) strlen(tempo) == ((int) b - (int) adr)) {
  594.                                   strncpy(adr,tempo,strlen(tempo));   // PAS d'octet nul α la fin!
  595.                                   p=0;    // DEVALIDER!!
  596.                                   p_type=0;
  597.                                   add_class=0;
  598.                                 }
  599.                               }
  600.                             }
  601.                           }
  602.                         }
  603.                         
  604.                       }
  605.                     }
  606.                   }
  607.                   
  608.                   // liens à patcher mais pas à charger (ex: codebase)
  609.                   if (p==0) {  // note: si non chargΘ (ex: ignorer .class) patchΘ tout de mΩme
  610.                     p=rech_tageq(adr,"codebase");
  611.                     if (p) {
  612.                       if ((intag_start_valid) && check_tag(intag_start,"applet")) {  // dans un <applet !
  613.                         p_type=-2;
  614.                       } else p=-1;   // ne plus chercher
  615.                     }
  616.                   }
  617.                   
  618.                   
  619.                   // Meta tags pour robots
  620.                   if (p==0) {
  621.                     if (opt.robots) {
  622.                       if ((intag_start_valid) && check_tag(intag_start,"meta")) {
  623.                         if (rech_tageq(adr,"name")) {    // name=robots.txt
  624.                           char tempo[1100];
  625.                           char* a;
  626.                           tempo[0]='\0';
  627.                           a=strchr(adr,'>');
  628. #if DEBUG_ROBOTS
  629.                           printf("robots.txt meta tag detected\n");
  630. #endif
  631.                           if (a) {
  632.                             if (((int) a - (int) adr) < 999 ) {
  633.                               strncat(tempo,adr,(int) a - (int) adr);
  634.                               if (strstrcase(tempo,"content")) {
  635.                                 if (strstrcase(tempo,"robots")) {
  636.                                   if (strstrcase(tempo,"nofollow")) {
  637. #if DEBUG_ROBOTS
  638.                                     printf("robots.txt meta tag: nofollow in %s%s\n",urladr,urlfil);
  639. #endif
  640.                                     nofollow=1;       // NE PLUS suivre liens dans cette page
  641.                                     if (opt.errlog) {
  642.                                       fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s not scanned (follow robots meta tag)"LF,urladr,urlfil);
  643.                                       test_flush;
  644.                                     }
  645.                                   }
  646.                                 }
  647.                               }
  648.                             }
  649.                           }
  650.                         }
  651.                       }
  652.                     }
  653.                   }
  654.                   
  655.                   // entrΘe dans une applet javascript
  656.                   /*if (!inscript) {  // sinon on est dans un obj.write("..
  657.                   if (p==0)
  658.                   if (rech_sampletag(adr,"script"))
  659.                   if (check_tag(intag_start,"script")) {
  660.                   inscript=1;
  661.                   }
  662.                         }*/
  663.                   
  664.                   // Ici on procΦde α une analyse du code javascript pour tenter de rΘcupΘrer
  665.                   // certains fichiers Θvidents.
  666.                   // C'est devenu obligatoire vu le nombre de pages qui intΦgrent
  667.                   // des images rΘactives par exemple
  668.                 }
  669.               } else if (inscript) {
  670.                 if (strfield(adr,"/script") ) {
  671.                   char* a=adr;
  672.                   //while(is_realspace(*(--a)));
  673.                   while( is_realspace(*a) ) a--;
  674.                   a--;
  675.                   if (*a=='<') {  // s√r que c'est un tag?
  676.                     inscript=0;
  677.                   }
  678.                 } else {
  679.                   int nc;
  680.                   char  expected     = '=';          // caractΦre attendu aprΦs
  681.                   char* expected_end = ";";
  682.                   if (inscript_tag)
  683.                     expected_end=";\"\'";            // voir a href="javascript:doc.location='foo'"
  684.                   nc = strfield(adr,".src");  // nom.src="image";
  685.                   if (!nc) nc = strfield(adr,".location");  // document.location="doc"
  686.                   if (!nc) nc = strfield(adr,".href");  // document.location="doc"
  687.                   if (!nc) if ( (nc = strfield(adr,".open")) ) { // window.open("doc",..
  688.                     expected='(';    // parenthΦse
  689.                     expected_end="),";  // fin: virgule ou parenthΦse
  690.                   }
  691.                   if (!nc) if ( (nc = strfield(adr,".replace")) ) { // window.replace("url")
  692.                     expected='(';    // parenthΦse
  693.                     expected_end=")";  // fin: parenthΦse
  694.                   }
  695.                   if (!nc) if ( (nc = strfield(adr,".link")) ) { // window.link("url")
  696.                     expected='(';    // parenthΦse
  697.                     expected_end=")";  // fin: parenthΦse
  698.                   }
  699.                   if (nc) {
  700.                     char *a;
  701.                     a=adr+nc;
  702.                     while(is_space(*a)) a++;
  703.                     if (*a == expected) {
  704.                       a++;
  705.                       while(is_realspace(*a)) a++;
  706.                       if ((*a==34) || (*a=='\'')) {
  707.                         char *b,*c;
  708.                         a++;
  709.                         b=a;
  710.                         while((*b!=34) && (*b!='\'') && (*b!='\0')) b++;
  711.                         c=b--; c++;
  712.                         while(*c==' ') c++;
  713.                         if ((strchr(expected_end,*c)) || (*c=='\n') || (*c=='\r')) {
  714.                           c-=2;
  715.                           if ((int) c-(int) a+1) {
  716.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  717.                               char str[512];
  718.                               str[0]='\0';
  719.                               strncat(str,a,minimum((int) c-(int) a+1,32));
  720.                               fspc(opt.log,"debug"); fprintf(opt.log,"link detected in javascript: %s"LF,str); test_flush;
  721.                             }
  722.                             p=(int) a- (int) adr;    // p non nul: TRAITER CHAINE COMME FICHIER
  723.                           }
  724.                         }
  725.                         
  726.                         
  727.                       }
  728.                     }
  729.                   }
  730.                   
  731.                 }
  732.               }
  733.             }
  734.                 
  735.           } else
  736.             p=rech_tageq(adr,"primary");    // lien primaire, yeah
  737.           }
  738.           
  739.           
  740.           // ------------------------------------------------------------
  741.           // dernier recours - parsing "sale" : dΘtection systΘmatique des .gif, etc.
  742.           // risque: gΘnΘrer de faux fichiers parazites
  743.           // fix: ne parse plus dans les commentaires
  744.           // ------------------------------------------------------------
  745.           if ( (opt.parseall) && (ptr>0) && (!in_media) ) {           // option parsing "brut"
  746.             int incomment_justquit=0;
  747.             if (!is_realspace(*adr)) {
  748.               int noparse=0;
  749.  
  750.               // Gestion des /* */
  751.               if (inscript) {
  752.                 if (parseall_incomment) {
  753.                   if ((*adr=='/') && (*(adr-1)=='*'))
  754.                     parseall_incomment=0;
  755.                   incomment_justquit=1;       // ne pas noter dernier caractΦre
  756.                 } else {
  757.                   if ((*adr=='/') && (*(adr+1)=='*'))
  758.                     parseall_incomment=1;
  759.                 }
  760.               } else
  761.                 parseall_incomment=0;
  762.  
  763.               /* vΘrifier que l'on est pas dans un <!-- --> pur */
  764.               if ( (!intag) && (incomment) && (!inscript))
  765.                 noparse=1;        /* commentaire */
  766.  
  767.               // recherche d'URLs
  768.               if ((!parseall_incomment) && (!noparse)) {
  769.                 if (!p) {                   // non dΘja trouvΘ
  770.                   if (adr != r.adr) {     // >1 caractΦre
  771.                     // scanner les chaines
  772.                     if ((*adr == '\"') || (*adr=='\'')) {         // "xx.gif" 'xx.gif'
  773.                       if (strchr("=(,",parseall_lastc)) {    // exemple: a="img.gif..
  774.                         char *a=adr;
  775.                         char stop=*adr;  // " ou '
  776.                         int count=0;
  777.                         
  778.                         // sauter caractΦres
  779.                         a++;
  780.                         // copier
  781.                         while((*a) && (*a!='\'') && (*a!='\"') && (count<HTS_URLMAXSIZE)) { count++; a++; }
  782.                         
  783.                         // ok chaine terminΘe par " ou '
  784.                         if ((*a == stop) && (count<HTS_URLMAXSIZE) && (count>0)) {
  785.                           char c;
  786.                           char* aend;
  787.                           //
  788.                           aend=a;     // sauver dΘbut
  789.                           a++;
  790.                           while(is_realspace(*a)) a++;
  791.                           c=*a;
  792.                           if (strchr("),;>/+",c)) {     // exemple: ..img.gif";
  793.                             // le / est pour funct("img.gif" /* URL */);
  794.                             char tempo[HTS_URLMAXSIZE*2];
  795.                             char type[256];
  796.                             int url_ok=0;      // url valide?
  797.                             tempo[0]='\0'; type[0]='\0';
  798.                             //
  799.                             strncat(tempo,adr+1,count);
  800.                             //
  801.                             if ((!strchr(tempo,' ')) || inscript) {   // espace dedans: mΘfiance! (sauf dans code javascript)
  802.                               int invalid_url=0;
  803.                               
  804.                               // Couper au # ou ? Θventuel
  805.                               {
  806.                                 char* a=strchr(tempo,'#');
  807.                                 if (a)
  808.                                   *a='\0';
  809.                                 a=strchr(tempo,'?');
  810.                                 if (a)
  811.                                   *a='\0';
  812.                               }
  813.  
  814.                               // vΘrifier qu'il n'y a pas de caractΦres spΘciaux
  815.                               if (strchr(tempo,'*')
  816.                                 || strchr(tempo,'<')
  817.                                 || strchr(tempo,'>'))
  818.                                 invalid_url=1;
  819.                               
  820.                               /* non invalide? */
  821.                               if (!invalid_url) {
  822.                                 // Un plus α la fin? Alors ne pas prendre sauf si extension ("/toto.html#"+tag)
  823.                                 if (c!='+') {    // PAS de plus α la fin
  824.                                   char* a;
  825.                                   if ((strncmp(tempo,"http://",7)==0) || (strncmp(tempo,"ftp://",6)==0))  // ok pas de problΦme
  826.                                     url_ok=1;
  827.                                   else if (tempo[strlen(tempo)-1]=='/') {        // un slash: ok..
  828.                                     if (inscript)   // sinon si pas javascript, mΘfiance (rΘpertoire style base?)
  829.                                       url_ok=1;
  830.                                   } else if ((a=strchr(tempo,'/'))) {        // un slash: ok..
  831.                                     if (inscript) {    // sinon si pas javascript, mΘfiance (style "text/css")
  832.                                       if (strchr(a+1,'/'))  // un seul / : abandon (STYLE type='text/css')
  833.                                         url_ok=1;
  834.                                     }
  835.                                   }
  836.                                 }
  837.                                 // Prendre si extension reconnue
  838.                                 if (!url_ok) {
  839.                                   get_httptype(type,tempo,0);
  840.                                   if (strnotempty(type))     // type reconnu!
  841.                                     url_ok=1;
  842.                                   else if (is_dyntype(get_ext(tempo)))  // reconnu php,cgi,asp..
  843.                                     url_ok=1;
  844.                                 }
  845.                                 //
  846.                                 // Ok, cela pourrait Ωtre une URL
  847.                                 if (url_ok) {
  848.                                   // Accepter URL, on la traitera comme une URL normale!!
  849.                                   p=1;
  850.                                 }
  851.                               }
  852.                             }
  853.                           }
  854.                         }
  855.                       }
  856.                     }
  857.                   }
  858.                 }  // p == 0
  859.                 
  860.                 // plus dans un commentaire
  861.                 if (!incomment_justquit)
  862.                   parseall_lastc=*adr;             // caractΦre avant le prochain
  863.                 
  864.               } // not in comment
  865.               
  866.             }  // if realspace
  867.           }  // if parseall
  868.           
  869.           
  870.           // ------------------------------------------------------------
  871.           // p!=0 : on a repΘrΘ un Θventuel lien
  872.           // ------------------------------------------------------------
  873.           //
  874.           if ((p>0) || (valid_p)) {    // on a repΘrΘ un lien
  875.             //int lien_valide=0;
  876.             char* eadr=NULL;          /* fin de l'URL */
  877.             char* quote_adr=NULL;     /* adresse du ? dans l'adresse */
  878.             int ok=1;
  879.             char quote='\0';
  880.  
  881.             // TEST
  882.             /*{
  883.               static int loop=0;
  884.               if ((++loop)%5000==0)
  885.                 loop=0;
  886.             }*/
  887.             
  888.             // si nofollow a ΘtΘ dΘclenchΘ, rΘΘcrire tous les liens en externe
  889.             if (nofollow)
  890.               p_nocatch=1;
  891.  
  892.             // Θcrire codebase avant, flusher avant code
  893.             if ((p_type==-1) || (p_type==-2)) {
  894.               if ((opt.getmode & 1) && (ptr>0)) {
  895.                 HT_ADD_ADR;    // refresh
  896.               }
  897.               lastsaved=adr;    // dernier Θcrit+1
  898.             }
  899.             
  900.             // sauter espaces
  901.             adr+=p;
  902.             while((is_space(*adr)) && (quote=='\0')) {
  903.               if (!quote)
  904.                 if ((*adr=='\"') || (*adr=='\''))
  905.                   quote=*adr;                     // on doit attendre cela α la fin
  906.                                                   // puis quitter
  907.                 adr++;    // sauter les espaces, "" et cie
  908.             }
  909.             /* s'arrΩter que ce soit un ' ou un " : pour document.write('<img src="foo'+a); par exemple! */
  910.             if (inscript)
  911.               quote='\0';
  912.             
  913.             // sauter Θventuel \" ou \' javascript
  914.             if (inscript) {    // on est dans un obj.write("..
  915.               if (*adr=='\\') {
  916.                 if ((*(adr+1)=='\'') || (*(adr+1)=='"')) {  // \" ou \'
  917.                   adr+=2;    // sauter
  918.                 }
  919.               }
  920.             }
  921.             
  922.             // META refresh: skip the delay part of content="1;URL=http://.." (scan at most 128 chars for "URL=")
  923.             if (p_searchMETAURL) {
  924.               int l=0;
  925.               while((l<128) && (adr[l]!='\0') && !strfield(adr+l,"URL=")) l++;   // never scan past the terminating NUL
  926.               if (!strfield(adr+l,"URL="))   // test where the scan stopped (testing adr itself rejected every "N;URL=" form)
  927.                 ok=-1;                       // no URL= found: do not treat as a link
  928.               else
  929.                 adr+=(l+4);                  // jump just past "URL="
  930.             }
  931.  
  932.             /* Θviter les javascript:document.location=.. : les parser, plut⌠t */
  933.             if (ok!=-1) {
  934.               if (strfield(adr,"javascript:")) {
  935.                 ok=-1;
  936.                 /*
  937.                 On est dΘsormais dans du code javascript
  938.                 */
  939.                 inscript_tag=inscript=1;
  940.                 inscript_tag_lastc=quote;     /* α attendre α la fin */
  941.               }
  942.             }
  943.             
  944.             if (p_type==1) {
  945.               if (*adr=='#') {
  946.                 adr++;           // sauter # pour usemap etc
  947.               }
  948.             }
  949.             eadr=adr;
  950.             
  951.             // ne pas flusher aprΦs code si on doit Θcrire le codebase avant!
  952.             if ((p_type!=-1) && (p_type!=2) && (p_type!=-2)) {
  953.               if ((opt.getmode & 1) && (ptr>0)) {
  954.                 HT_ADD_ADR;    // refresh
  955.               }
  956.               lastsaved=adr;    // dernier Θcrit+1
  957.               // aprΦs on Θcrira soit les donnΘes initiales,
  958.               // soir une URL/lien modifiΘ!
  959.             } else if (p_type==-1) p_flush=adr;    // flusher jusqu'α adr ensuite
  960.             
  961.             if (ok!=-1) {    // continuer
  962.               // dΘcouper le lien
  963.               do {
  964.                 if ((* (unsigned char*) eadr)<32) {   // caractΦre de contr⌠le (ou \0)
  965.                   if (!is_space(*eadr))
  966.                     ok=0; 
  967.                 }
  968.                 if ( ( ((int) eadr) - ((int) adr) ) > HTS_URLMAXSIZE)  // ** trop long, >HTS_URLMAXSIZE caractΦres (on prΘvoit HTS_URLMAXSIZE autres pour path)
  969.                   ok=-1;    // ne pas traiter ce lien
  970.                 
  971.                 if (ok) {
  972.                   //if (*eadr!=' ') {  
  973.                   if (is_space(*eadr)) {   // guillemets,CR, etc
  974.                     if ((!quote) || (*eadr==quote))     // si pas d'attente de quote spΘciale ou si quote atteinte
  975.                       ok=0; 
  976.                   }
  977.                   else {
  978.                     switch(*eadr) {
  979.                     case '>': 
  980.                       if (!quote) {
  981.                         if (!inscript) {
  982.                           intag=0;    // PLUS dans un tag!
  983.                           intag_start_valid=0;
  984.                         }
  985.                         ok=0;
  986.                       }
  987.                       break;
  988.                       /*case '<':*/ case '#': ok=0; break;    // case '?': non!
  989.                     case '\\': if (inscript) ok=0; break;     // \" ou \' point d'arrΩt
  990.                     case '?': quote_adr=adr; break;           // noter position query
  991.                     }
  992.                   }
  993.                   //}
  994.                 } 
  995.                 eadr++;
  996.               } while(ok==1);     
  997.               
  998.               if ( (((int) eadr)-((int) adr)) <= 1) ok=-1;     // lien vide
  999.             }
  1000.             
  1001.             if (ok==0) {    // tester un lien
  1002.               char lien[HTS_URLMAXSIZE*2];
  1003.               int meme_adresse=0;      // 0 par dΘfaut pour primary
  1004.               //char *copie_de_adr=adr;
  1005.               //char* p;
  1006.               
  1007.               // construire lien (dΘcoupage)
  1008.               if ( (((int) eadr)-((int) adr)-1) < HTS_URLMAXSIZE  ) {    // pas trop long?
  1009.                 strncpy(lien,adr,((int) eadr)-((int) adr)-1);
  1010.                 *(lien+  (((int) eadr)-((int) adr))-1  )='\0';
  1011.                 //printf("link: %s\n",lien);          
  1012.                 // strip trailing spaces (check for emptiness first so lien[-1] is never read)
  1013.                 while((strnotempty(lien)) && (lien[strlen(lien)-1]==' ')) lien[strlen(lien)-1]='\0';
  1014.                 // supprimer les // en / (sauf pour http://)
  1015.                 {
  1016.                   char *a,*p,*q;
  1017.                   int done=0;
  1018.                   a=strstr(lien,":/");    // http://
  1019.                   if (a) {
  1020.                     a++;
  1021.                     while(*a=='/') a++;    // position aprΦs http://
  1022.                   } else {
  1023.                     a=lien;                // dΘbut
  1024.                   }
  1025.                   q=strchr(a,'?');     // ne pas traiter aprΦs '?'
  1026.                   if (!q)
  1027.                     q=a+strlen(a)-1;
  1028.                   while(( p=strstr(a,"//")) && (!done) ) {    // remplacer // par /
  1029.                     if ((int) p>(int) q) {   // aprΦs le ? (toto.cgi?param=1//2.3)
  1030.                       done=1;    // stopper
  1031.                     } else {
  1032.                       char tempo[HTS_URLMAXSIZE*2];
  1033.                       tempo[0]='\0';
  1034.                       strncat(tempo,a,(int) p - (int) a);
  1035.                       strcat (tempo,p+1);
  1036.                       strcpy(a,tempo);    // recopier
  1037.                     }
  1038.                   }
  1039.                 }
  1040.               } else
  1041.                 lien[0]='\0';    // erreur
  1042.               
  1043.               // ------------------------------------------------------
  1044.               // Lien repΘrΘ et extrait
  1045.               if (strnotempty(lien)>0) {           // construction du lien
  1046.                 char adr[HTS_URLMAXSIZE*2],fil[HTS_URLMAXSIZE*2];          // ATTENTION adr cache le "vrai" adr
  1047.                 int forbidden_url=-1;              // lien non interdit (mais non autorisΘ..)
  1048.                 int just_test_it=0;                // mode de test des liens
  1049.                 int set_prio_to=0;                 // pour capture de page isolΘe
  1050.                 int import_done=0;                 // lien importΘ (ne pas scanner ensuite *α priori*)
  1051.                 //
  1052.                 adr[0]='\0'; fil[0]='\0';
  1053.                 //
  1054.                 // 0: autorisΘ
  1055.                 // 1: interdit (patcher tout de mΩme adresse)
  1056.                 
  1057.                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1058.                   fspc(opt.log,"debug"); fprintf(opt.log,"link detected in html: %s"LF,lien); test_flush;
  1059.                 }
  1060.  
  1061.                 // purger CR,LF rΘsiduels (IMG SRC="foo.<\n>gif")
  1062.                 {
  1063.                   char* a;
  1064.                   while ((a=strchr(lien,'\n'))) {
  1065.                     char tempo[HTS_URLMAXSIZE*2];
  1066.                     tempo[0]='\0';
  1067.                     strncat(tempo,lien,(int) a - (int) lien);
  1068.                     strcat(tempo,a+1);
  1069.                     strcpy(lien,tempo);
  1070.                   }
  1071.                   while ((a=strchr(lien,'\r'))) {
  1072.                     char tempo[HTS_URLMAXSIZE*2];
  1073.                     tempo[0]='\0';
  1074.                     strncat(tempo,lien,(int) a - (int) lien);
  1075.                     strcat(tempo,a+1);
  1076.                     strcpy(lien,tempo);
  1077.                   }
  1078.                 }
  1079.                 
  1080.                 /* Unescape/escape %20 and other   */
  1081.                 {
  1082.                   char query[HTS_URLMAXSIZE*2];
  1083.                   char* a=strchr(lien,'?');
  1084.                   if (a) {
  1085.                     strcpy(query,a);
  1086.                     *a='\0';
  1087.                   } else
  1088.                     query[0]='\0';
  1089.                   // conversion & -> & et autres joyeusetΘs
  1090.                   unescape_amp(lien);
  1091.                   // dΘcoder l'inutile (%2E par exemple) et coder espaces
  1092.                   // XXXXXXXXXXXXXXXXX strcpy(lien,unescape_http(lien));
  1093.                   strcpy(lien,unescape_http_unharm(lien));
  1094.                   escape_spc_url(lien);
  1095.                   strcat(lien,query);     /* restore */
  1096.                 }
  1097.                 
  1098.                 // convertir les Θventuels \ en des / pour Θviter des problΦmes de reconnaissance!
  1099.                 {
  1100.                   char* a=jump_identification(lien);
  1101.                   while( (a=strchr(a,'\\')) ) *a='/';
  1102.                 }
  1103.                 
  1104.                 // supprimer le(s) ./
  1105.                 while ((lien[0]=='.') && (lien[1]=='/')) {
  1106.                   char tempo[HTS_URLMAXSIZE*2];
  1107.                   strcpy(tempo,lien+2);
  1108.                   strcpy(lien,tempo);
  1109.                 }
  1110.                 if (strnotempty(lien)==0)  // sauf si plus de nom de fichier
  1111.                   strcpy(lien,"./");
  1112.                 
  1113.                 // vΘrifie les /~machin -> /~machin/
  1114.                 // supposition dangereuse?
  1115.                 if (lien[strlen(lien)-1]!='/') {
  1116.                   char *a=lien+strlen(lien)-1;
  1117.                   // Θviter aussi index~1.html
  1118.                   while (((int) a>(int) lien) && (*a!='~') && (*a!='/') && (*a!='.')) a--;
  1119.                   if (*a=='~') {
  1120.                     strcat(lien,"/");    // ajouter slash
  1121.                   }
  1122.                 }
  1123.                 
  1124.                 
  1125.                 // remove an explicit default port (:80) from the link
  1126.                 {
  1127.                   char* a;
  1128.                   a=strstr(lien,"://");
  1129.                   if (a)
  1130.                     a+=3;                // start after the scheme separator
  1131.                   else
  1132.                     a=lien;              // no scheme: start at the beginning
  1133.                   while((*a) && (*a!='/') && (*a!=':')) a++;   // stop at first ':' or '/' (or end of string)
  1134.                   if (*a==':') {  // port
  1135.                     int port=0;
  1136.                     char* b=a+1;
  1137.                     while(isdigit((unsigned char)*b)) { port*=10; port+=(int) (*b-'0'); b++; }   // b ends just past the digits
  1138.                     if (port==80) {  // port 80, default
  1139.                       char tempo[HTS_URLMAXSIZE*2];
  1140.                       tempo[0]='\0';
  1141.                       strncat(tempo,lien,(size_t) (a-lien));   // everything before the ':'
  1142.                       strcat(tempo,b);  // resume after the whole port number (a+3 was wrong for ":080" etc.)
  1143.                       strcpy(lien,tempo);
  1144.                     }
  1145.                   }
  1146.                 }
  1147.                 
  1148.                 // filtrer les parazites (mailto & cie)
  1149.                 if (strfield(lien,"mailto:")) {  // ne pas traiter
  1150.                   error=1;
  1151.                 } else if (strfield(lien,"news:")) {  // ne pas traiter
  1152.                   error=1;
  1153.                 }
  1154.                 
  1155.                 // vΘrifier que l'on ne doit pas ajouter de .class
  1156.                 if (!error) {
  1157.                   if (add_class) {
  1158.                     char *a = lien+strlen(lien)-1;
  1159.                     while(((int) a > (int) lien) && (*a!='/') && (*a!='.')) a--;
  1160.                     if (*a != '.')
  1161.                       strcat(lien,".class");    // ajouter .class
  1162.                   }
  1163.                 }
  1164.                 
  1165.                 // si c'est un chemin, alors vΘrifier (toto/toto.html -> http://www/toto/)
  1166.                 if (!error) {
  1167.                   if ((opt.debug>1) && (opt.log!=NULL)) {
  1168.                     fspc(opt.log,"debug"); fprintf(opt.log,"position link check %s"LF,lien); test_flush;
  1169.                   }
  1170.                   
  1171.                   if ((p_type==2) || (p_type==-2)) {   // code ou codebase                        
  1172.                     // codebase=applet given instead of applet/ : make sure the codebase ends with a slash
  1173.                     if (p_type==-2) {    // codebase
  1174.                       if (strnotempty(lien)) {
  1175.                         if (lien[strlen(lien)-1]!='/') {  // not a directory yet (test lien itself; fil is still empty here)
  1176.                           strcat(lien,"/");
  1177.                         }
  1178.                       }
  1179.                     }
  1180.                     /* only one ending / (bug on some pages) */
  1181.                     if ((int)strlen(lien)>2) {
  1182.                       while( (lien[strlen(lien)-2]=='/') && ((int)strlen(lien)>2) )    /* double // (bug) */
  1183.                         lien[strlen(lien)-1]='\0';
  1184.                     }
  1185.                     // copier nom host si besoin est
  1186.                     if (strstr(lien,"://")==NULL) {  // pas de http://
  1187.                       char adr2[HTS_URLMAXSIZE*2],fil2[HTS_URLMAXSIZE*2];  // ** euh ident_url_relatif??
  1188.                       if (ident_url_relatif(lien,urladr,urlfil,adr2,fil2)<0) {                        
  1189.                         error=1;
  1190.                       } else {
  1191.                         strcpy(lien,"http://");
  1192.                         strcat(lien,adr2);
  1193.                         if (*fil2!='/')
  1194.                           strcat(lien,"/");
  1195.                         strcat(lien,fil2);
  1196.                         {
  1197.                           char* a;
  1198.                           a=lien+strlen(lien)-1;
  1199.                           while((*a) && (*a!='/') && ((int) a> (int) lien)) a--;
  1200.                           if (*a=='/') {
  1201.                             *(a+1)='\0';
  1202.                           }
  1203.                         }
  1204.                         //char tempo[HTS_URLMAXSIZE*2];
  1205.                         //strcpy(tempo,"http://");
  1206.                         //strcat(tempo,urladr);    // host
  1207.                         //if (*lien!='/')
  1208.                         //  strcat(tempo,"/");
  1209.                         //strcat(tempo,lien);
  1210.                         //strcpy(lien,tempo);
  1211.                       }
  1212.                     }
  1213.                     
  1214.                     if (!error) {  // pas d'erreur?
  1215.                       if (p_type==2) {   // code ET PAS codebase      
  1216.                         char* a=lien+strlen(lien)-1;
  1217.                         while( ((int) a > (int) lien) && (*a) && (*a!='/')) a--;
  1218.                         if (*a=='/')     // ok on a repΘrΘ le dernier /
  1219.                           *(a+1)='\0';   // couper
  1220.                         else {
  1221.                           *lien='\0';    // Θliminer
  1222.                           error=1;   // erreur, ne pas poursuivre
  1223.                         }      
  1224.                       }
  1225.                       
  1226.                       // stocker base ou codebase?
  1227.                       switch(p_type) {
  1228.                       case 2: { 
  1229.                         //if (*lien!='/') strcat(base,"/");
  1230.                         strcpy(base,lien);
  1231.                               }
  1232.                         break;      // base
  1233.                       case -2: {
  1234.                         //if (*lien!='/') strcat(codebase,"/");
  1235.                         strcpy(codebase,lien); 
  1236.                                }
  1237.                         break;  // base
  1238.                       }
  1239.                       
  1240.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1241.                         fspc(opt.log,"debug"); fprintf(opt.log,"code/codebase link %s base %s"LF,lien,base); test_flush;
  1242.                       }
  1243.                       //printf("base code: %s - %s\n",lien,base);
  1244.                     }
  1245.                     
  1246.                   } else {
  1247.                     char* _base;
  1248.                     if (p_type==-1)   // code (applet)
  1249.                       _base=codebase;
  1250.                     else
  1251.                       _base=base;
  1252.                     
  1253.                     // ajouter chemin de base href..
  1254.                     if (strnotempty(_base)) {       // considΘrer base
  1255.                       if (!strstr(lien,"://")) {    // non absolue
  1256.                         if (*lien!='/') {           // non absolu sur le site (/)
  1257.                           if ( ((int) strlen(_base)+(int) strlen(lien))<HTS_URLMAXSIZE) {
  1258.                             char tempo[HTS_URLMAXSIZE*2];
  1259.                             // base est absolue
  1260.                             strcpy(tempo,_base);
  1261.                             strcat(tempo,lien);
  1262.                             strcpy(lien,tempo);        // patcher en considΘrant base
  1263.                             // ** vΘrifier que ../ fonctionne (ne doit pas arriver mais bon..)
  1264.                             
  1265.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  1266.                               fspc(opt.log,"debug"); fprintf(opt.log,"link modified with code/codebase %s"LF,lien); test_flush;
  1267.                             }
  1268.                           } else {
  1269.                             error=1;    // erreur
  1270.                             if (opt.errlog) {
  1271.                               fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s too long with base href"LF,lien);
  1272.                               test_flush;
  1273.                             }
  1274.                           }
  1275.                         }
  1276.                       }
  1277.                     }
  1278.                   }
  1279.                   }
  1280.                   
  1281.                   
  1282.                   // turn any kind of link (http, relative, etc.) into an address
  1283.                   // and a path+file pair (adr,fil), resolved against the current page (urladr,urlfil)
  1284.                   if (!error) {
  1285.                     int reponse;   // return code of ident_url_relatif; negative values are failures
  1286.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1287.                       fspc(opt.log,"debug"); fprintf(opt.log,"build relative link %s with %s%s"LF,lien,urladr,urlfil); test_flush;
  1288.                     }
  1289.                     if ((reponse=ident_url_relatif(lien,urladr,urlfil,adr,fil))<0) {                        
  1290.                       adr[0]='\0';    // error: leave adr empty
  1291.                       if (reponse==-2) {   // -2 is the case reported below as an unsupported ftp:// link
  1292.                         if (opt.errlog) {
  1293.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s not caught (unknown ftp:// protocol)"LF,lien);
  1294.                           test_flush;
  1295.                         }
  1296.                       }
  1297.                     }
  1298.                   } else {
  1299.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1300.                       fspc(opt.log,"debug"); fprintf(opt.log,"link %s not build, error detected before"LF,lien); test_flush;
  1301.                     }
  1302.                     adr[0]='\0';   // an earlier step already failed: leave adr empty as well
  1303.                   }
  1304.                   
  1305. #if HTS_CHECK_STRANGEDIR
  1306.                   // !WARNING!
  1307.                   // Here we test exotic links such as www.truc.fr/machin (no trailing slash)
  1308.                   // I have not yet found a way to tell a directory from a file
  1309.                   // over http A PRIORI, so a test request is made
  1310.                   // If the answer is a "moved xxx", adr and fil are simply recomputed
  1311.                   // FLAW: the test is performed several times! to be reviewed!!!
  1312.                   if ((adr[0]!='\0') && (strcmp(adr,"file://")!=0) && (p_type!=2) && (p_type!=-2)) {   // fixed: the strcmp(...) sub-expression was missing its closing parenthesis
  1313.                     //## if ((adr[0]!='\0') && (adr[0]!=lOCAL_CHAR) && (p_type!=2) && (p_type!=-2)) {
  1314.                     if (fil[strlen(fil)-1]!='/') {  // not a directory
  1315.                       if (ishtml(fil)==-2) {    // no extension
  1316.                         char loc[HTS_URLMAXSIZE*2];  // possible new location
  1317.                         loc[0]='\0';
  1318.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1319.                           fspc(opt.log,"debug"); fprintf(opt.log,"link-check-directory: %s%s"LF,adr,fil);
  1320.                           test_flush;
  1321.                         }
  1322.                         
  1323.                         // probe for a possible new location
  1324.                         switch (http_location(adr,fil,loc).statuscode) {
  1325.                         case 200: // ok in the end
  1326.                           if (strnotempty(loc)) {  // the address has changed
  1327.                             if (opt.errlog) {
  1328.                               fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s has moved to %s for %s%s"LF,adr,fil,loc,urladr,urlfil);
  1329.                               test_flush;
  1330.                             }
  1331.                             
  1332.                             // recompute adr and fil from the new location
  1333.                             if (ident_url(loc,adr,fil)==-1) {
  1334.                               adr[0]='\0';  // cancel
  1335.                               if ((opt.debug>1) && (opt.log!=NULL)) {
  1336.                                 fspc(opt.log,"debug"); fprintf(opt.log,"link-check-dir: %s%s"LF,adr,fil);
  1337.                                 test_flush;
  1338.                               }
  1339.                             }
  1340.                             
  1341.                           }
  1342.                           break;
  1343.                         case -2: case -3:  // timeout or serious error
  1344.                           if (opt.errlog) {
  1345.                             fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Connection too slow for testing link %s%s (from %s%s)"LF,adr,fil,urladr,urlfil);
  1346.                             test_flush;
  1347.                           }
  1348.                           
  1349.                           break;
  1350.                         }
  1351.                         
  1352.                       }
  1353.                     } 
  1354.                   }
  1355. #endif
  1356.                   
  1357.                   // The link only has to be rewritten; it must not be queued as a link to fetch
  1358.                   // example: <FORM ACTION="url_cgi">
  1359.                   if (p_nocatch) {
  1360.                     forbidden_url=1;    // forbid fetching this link
  1361.                     if ((opt.debug>1) && (opt.log!=NULL)) {
  1362.                       fspc(opt.log,"debug"); fprintf(opt.log,"link ignored at %s%s"LF,adr,fil);
  1363.                       test_flush;
  1364.                     }
  1365.                   }
  1366.                   
  1367.                   // Test whether the link must be accepted or refused (wizard)
  1368.                   // forbidden_url=1 : link refused
  1369.                   // forbidden_url=0 : link accepted
  1370.                   //if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // check authorizations?
  1371.                   if ((p_type!=2) && (p_type!=-2)) {    // check authorizations? (not for the p_type 2 / -2 base and codebase entries)
  1372.                     if (!p_nocatch) {   // p_nocatch links were already forced to forbidden_url=1
  1373.                       if (adr[0]!='\0') {   // only when an address could be built
  1374.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1375.                           fspc(opt.log,"debug"); fprintf(opt.log,"wizard link test at %s%s.."LF,adr,fil);
  1376.                           test_flush;
  1377.                         }
  1378.                         forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1379.                           adr,fil,
  1380.                           filters,&filptr,filter_max,
  1381.                           &robots,
  1382.                           &set_prio_to,
  1383.                           &just_test_it);
  1384.                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1385.                           fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard link test: %d"LF,forbidden_url);
  1386.                           test_flush;
  1387.                         }
  1388.                       }
  1389.                     }
  1390.                   }
  1391.                   
  1392.                   // compute meme_adresse ("same address"): strfield2 result on adr vs. urladr, both passed through jump_identification (presumably skipping the login part - confirm)
  1393.                   meme_adresse=strfield2(jump_identification(adr),jump_identification(urladr));
  1394.                   
  1395.                   
  1396.                   
  1397.                   // DΘbut partie sauvegarde
  1398.                   
  1399.                   // ici on forme le nom du fichier α sauver, et on patche l'URL
  1400.                   if (adr[0]!='\0') {
  1401.                     // savename: simplifier les ../ et autres joyeusetΘs
  1402.                     char save[HTS_URLMAXSIZE*2];
  1403.                     int r_sv=0;
  1404.                     // En cas de moved, adresse premiΦre
  1405.                     char former_adr[HTS_URLMAXSIZE*2];
  1406.                     char former_fil[HTS_URLMAXSIZE*2];
  1407.                     //
  1408.                     save[0]='\0'; former_adr[0]='\0'; former_fil[0]='\0';
  1409.                     //
  1410.                     
  1411.                     // nom du chemin α sauver si on doit le calculer
  1412.                     // note: url_savename peut dΘcider de tester le lien si il le trouve
  1413.                     // suspect, et modifier alors adr et fil
  1414.                     // dans ce cas on aura une rΘfΘrence directe au lieu des traditionnels
  1415.                     // moved en cascade (impossible α reproduire α priori en local, lorsque des fichiers
  1416.                     // gif sont impliquΘs par exemple)
  1417.                     if ((p_type!=2) && (p_type!=-2)) {  // pas base href ou codebase
  1418.                       if (forbidden_url!=1) {
  1419.                         char last_adr[HTS_URLMAXSIZE*2];
  1420.                         last_adr[0]='\0';
  1421.                         //char last_fil[HTS_URLMAXSIZE*2]="";
  1422.                         strcpy(last_adr,adr);    // ancienne adresse
  1423.                         //strcpy(last_fil,fil);    // ancien chemin
  1424.                         r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,&opt,liens,lien_tot,back,back_max,&cache,&hash,ptr,numero_passe);
  1425.                         if (strcmp(jump_identification(last_adr),jump_identification(adr)) != 0) {  // a changΘ
  1426.                           
  1427.                           // 2e test si moved
  1428.                           
  1429.                           // Tester si un lien doit Ωtre acceptΘ ou refusΘ (wizard)
  1430.                           // forbidden_url=1 : lien refusΘ
  1431.                           // forbidden_url=0 : lien acceptΘ
  1432.                           if ((ptr>0) && (p_type!=2) && (p_type!=-2)) {    // tester autorisations?
  1433.                             if (!p_nocatch) {
  1434.                               if (adr[0]!='\0') {          
  1435.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1436.                                   fspc(opt.log,"debug"); fprintf(opt.log,"wizard moved link retest at %s%s.."LF,adr,fil);
  1437.                                   test_flush;
  1438.                                 }
  1439.                                 forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens,
  1440.                                   adr,fil,
  1441.                                   filters,&filptr,filter_max,
  1442.                                   &robots,
  1443.                                   &set_prio_to,
  1444.                                   &just_test_it);
  1445.                                 if ((opt.debug>1) && (opt.log!=NULL)) {
  1446.                                   fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard moved link retest: %d"LF,forbidden_url);
  1447.                                   test_flush;
  1448.                                 }
  1449.                               }
  1450.                             }
  1451.                           }
  1452.                           
  1453.                           //import_done=1;    // c'est un import!
  1454.                           meme_adresse=0;   // on a changΘ
  1455.                         }
  1456.                       } else {
  1457.                         strcpy(save,"");  // dummy
  1458.                       }
  1459.                     }
  1460.                     if (r_sv!=-1) {  // pas d'erreur, on continue
  1461.                       /* log */
  1462.                       if ((opt.debug>1) && (opt.log!=NULL)) {
  1463.                         fspc(opt.log,"debug");
  1464.                         if (forbidden_url!=1) {    // le lien va Ωtre chargΘ
  1465.                           if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, pas un lien
  1466.                             fprintf(opt.log,"Code/Codebase: %s%s"LF,adr,fil);
  1467.                           } else if ((opt.getmode & 4)==0) {
  1468.                             fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1469.                           } else {
  1470.                             if (!ishtml(fil))
  1471.                               fprintf(opt.log,"Record after: %s%s -> %s"LF,adr,fil,save);
  1472.                             else
  1473.                               fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save);
  1474.                           } 
  1475.                         } else
  1476.                           fprintf(opt.log,"External: %s%s"LF,adr,fil);
  1477.                         test_flush;
  1478.                       }
  1479.                       /* FIN log */
  1480.                       
  1481.                       // Θcrire lien
  1482.                       if ((p_type==2) || (p_type==-2)) {  // base href ou codebase, sauter
  1483.                         lastsaved=eadr-1+1;  // sauter "
  1484.                       } else if (forbidden_url==1) {    // le lien ne sera pas chargΘ, rΘfΘrence externe!
  1485.                         if ((opt.getmode & 1) && (ptr>0)) {
  1486.                           if (p_type!=-1) {     // pas que le nom de fichier (pas classe java)
  1487.                             if (!opt.external) {
  1488.                               if (!strstr(adr,"://")) {
  1489.                                 HT_ADD("http://");
  1490.                               }
  1491.                               if (!opt.passprivacy) {
  1492.                                 HT_ADD(adr);     // Password
  1493.                               } else {
  1494.                                 HT_ADD(jump_identification(adr));     // No Password
  1495.                               }
  1496.                               if (*fil!='/')
  1497.                                 HT_ADD("/");
  1498.                               HT_ADD(fil);
  1499.                               //
  1500.                             } else {    // fichier/page externe, mais on veut gΘnΘrer une erreur
  1501.                               //
  1502.                               int patch_it=0;
  1503.                               int add_url=0;
  1504.                               char* cat_name=NULL;
  1505.                               char* cat_data=NULL;
  1506.                               int cat_nb=0;
  1507.                               int cat_data_len=0;
  1508.                               
  1509.                               // ajouter lien external
  1510.                               switch ((fil[strlen(fil)-1]=='/')?1:(ishtml(fil))) {
  1511.                               case 1: case -2:       // html ou rΘpertoire
  1512.                                 if (opt.getmode & 1) {  // sauver html
  1513.                                   patch_it=1;   // redirect
  1514.                                   add_url=1;    // avec link?
  1515.                                   cat_name="external.html";
  1516.                                   cat_nb=0;
  1517.                                   cat_data=HTS_DATA_UNKNOWN_HTML;
  1518.                                   cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN;
  1519.                                 }
  1520.                                 break;
  1521.                               default:    // inconnu
  1522.                                 if ( (strfield2(fil+strlen(fil)-4,".gif")) 
  1523.                                   || (strfield2(fil+strlen(fil)-4,".jpg")) 
  1524.                                   || (strfield2(fil+strlen(fil)-4,".xbm")) 
  1525.                                   || (ishtml(fil)!=0) ) {
  1526.                                   patch_it=1;   // redirect
  1527.                                   add_url=1;    // avec link aussi
  1528.                                   cat_name="external.gif";
  1529.                                   cat_nb=1;
  1530.                                   cat_data=HTS_DATA_UNKNOWN_GIF;
  1531.                                   cat_data_len=HTS_DATA_UNKNOWN_GIF_LEN;
  1532.                                 }
  1533.                                 break;
  1534.                               }// html,gif
  1535.                               
  1536.                               if (patch_it) {
  1537.                                 char save[HTS_URLMAXSIZE*2];
  1538.                                 char tempo[HTS_URLMAXSIZE*2];
  1539.                                 strcpy(save,opt.path_html);
  1540.                                 strcat(save,cat_name);
  1541.                                 if (lienrelatif(tempo,save,savename)==0) {
  1542.                                   HT_ADD(tempo);    // page externe
  1543.                                   if (add_url) {
  1544.                                     HT_ADD("?link=");    // page externe
  1545.                                     if (!opt.passprivacy) {
  1546.                                       HT_ADD(adr);   // Password
  1547.                                     } else {
  1548.                                       HT_ADD(jump_identification(adr));   // No Password
  1549.                                     }
  1550.                                     if (*fil!='/')
  1551.                                       HT_ADD("/");
  1552.                                     HT_ADD(fil);
  1553.                                   }
  1554.                                 }
  1555.                                 
  1556.                                 // Θcrire fichier?
  1557.                                 if (verif_external(cat_nb,1)) {
  1558.                                 //if (!fexist(fconcat(opt.path_html,cat_name))) {
  1559.                                   FILE* fp = filecreate(fconcat(opt.path_html,cat_name));
  1560.                                   if (fp) {
  1561.                                     if (cat_data_len==0) {   // texte
  1562.                                       verif_backblue(opt.path_html);
  1563.                                       fprintf(fp,"%s%s","<!-- Created by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS" -->"LF,cat_data);
  1564.                                     } else {                    // data
  1565.                                       fwrite(cat_data,cat_data_len,1,fp);
  1566.                                     }
  1567.                                     fclose(fp);
  1568.                                     usercommand(0,NULL,fconcat(opt.path_html,cat_name));
  1569.                                   }
  1570.                                 }
  1571.                               }  else {    // Θcrire normalement le nom de fichier
  1572.                                 HT_ADD("http://");
  1573.                                 if (!opt.passprivacy) {
  1574.                                   HT_ADD(adr);       // Password
  1575.                                 } else {
  1576.                                   HT_ADD(jump_identification(adr));       // No Password
  1577.                                 }
  1578.                                 if (*fil!='/')
  1579.                                   HT_ADD("/");
  1580.                                 HT_ADD(fil);
  1581.                               }// patcher?
  1582.                             }  // external
  1583.                           } else {  // que le nom de fichier (classe java)
  1584.                             // en gros recopie de plus bas: copier codebase et base
  1585.                             if (p_flush) {   // split fil into path + file name, then write codebase="http://..." followed by the pending text and the file name
  1586.                               char tempo[HTS_URLMAXSIZE*2];    // copy of fil; ends up holding the file name only
  1587.                               char tempo_pat[HTS_URLMAXSIZE*2];   // receives the path part, trailing '/' included
  1588.                               tempo_pat[0]='\0';
  1589.                               strcpy(tempo,fil);
  1590.                               {
  1591.                                 char* a=tempo+strlen(tempo);   // start one past the last character (never steps before tempo)
  1592.                                 while( (a > tempo) && (*(a-1)!='/')) a--;   // pointers compared directly, no truncating int cast
  1593.                                 if (a > tempo) {   // a now points just after the last '/'
  1594.                                   char tempo2[HTS_URLMAXSIZE*2];
  1595.                                   strcpy(tempo2,a);
  1596.                                   strncat(tempo_pat,tempo,(int) (a-tempo));  // path
  1597.                                   strcpy(tempo,tempo2);                     // file
  1598.                                 }
  1599.                               }
  1600.                               
  1601.                               // write codebase="path"
  1602.                               if ((opt.getmode & 1) && (ptr>0)) {
  1603.                                 char tempo4[HTS_URLMAXSIZE*2];
  1604.                                 tempo4[0]='\0';
  1605.                                 
  1606.                                 if (strnotempty(tempo_pat)) {
  1607.                                   HT_ADD("codebase=\"http://");
  1608.                                   if (!opt.passprivacy) {
  1609.                                     HT_ADD(adr);  // Password
  1610.                                   } else {
  1611.                                     HT_ADD(jump_identification(adr));  // No Password
  1612.                                   }
  1613.                                   if (*tempo_pat!='/') HT_ADD("/");
  1614.                                   HT_ADD(tempo_pat);
  1615.                                   HT_ADD("\" ");
  1616.                                 }
  1617.                                 
  1618.                                 strncat(tempo4,lastsaved,(int) p_flush-(int) lastsaved);   // NOTE(review): same pointer-to-int casts; left untouched because p_flush/lastsaved are declared outside this view
  1619.                                 HT_ADD(tempo4);    // refresh code="
  1620.                                 HT_ADD(tempo);
  1621.                               }
  1622.                             }
  1623.                           }
  1624.                         }
  1625.                         lastsaved=eadr-1;
  1626.                       } 
  1627.                       /*
  1628.                       else if (opt.urlmode==1) {    // ABSOLU, c'est le cas le moins courant
  1629.                       //  NE FONCTIONNE PAS!!  (et est inutile)
  1630.                       if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1631.                       // Θcrire le lien modifiΘ, absolu
  1632.                       HT_ADD("file:");
  1633.                       if (*save=='/')
  1634.                       HT_ADD(save+1)
  1635.                       else
  1636.                       HT_ADD(save)
  1637.                       }
  1638.                       lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1639.                       }
  1640.                       */
  1641.                       else if (opt.urlmode==3) {    // URI absolue /
  1642.                         if ((opt.getmode & 1) && (ptr>0)) {    // ecrire les html
  1643.                           HT_ADD(fil);
  1644.                         }
  1645.                         lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1646.                       }
  1647.                       else if (opt.urlmode==2) {  // RELATIF
  1648.                         char tempo[HTS_URLMAXSIZE*2];
  1649.                         tempo[0]='\0';
  1650.                         // calculer le lien relatif
  1651.                         
  1652.                         if (lienrelatif(tempo,save,savename)==0) {
  1653.                           if ((opt.debug>1) && (opt.log!=NULL)) {
  1654.                             fspc(opt.log,"debug"); fprintf(opt.log,"relative link at %s build with %s and %s: %s"LF,adr,save,savename,tempo);
  1655.                             test_flush;
  1656.                           }
  1657.                           
  1658.                           // lien applet (code) - il faut placer un codebase avant
  1659.                           if (p_type==-1) {  // que le nom de fichier
  1660.                             
  1661.                             if (p_flush) {   // split the relative link tempo into path + file name, then write codebase="path" followed by the pending text
  1662.                               char tempo_pat[HTS_URLMAXSIZE*2];   // receives the path part, trailing '/' included
  1663.                               tempo_pat[0]='\0';
  1664.                               {
  1665.                                 char* a=tempo+strlen(tempo);   // start one past the last character (never steps before tempo)
  1666.                                 while( (a > tempo) && (*(a-1)!='/')) a--;   // pointers compared directly, no truncating int cast
  1667.                                 if (a > tempo) {   // a now points just after the last '/'
  1668.                                   char tempo2[HTS_URLMAXSIZE*2];
  1669.                                   strcpy(tempo2,a);
  1670.                                   strncat(tempo_pat,tempo,(int) (a-tempo));  // path
  1671.                                   strcpy(tempo,tempo2);                     // file
  1672.                                 }
  1673.                               }
  1674.                               
  1675.                               // write codebase="path"
  1676.                               if ((opt.getmode & 1) && (ptr>0)) {
  1677.                                 char tempo4[HTS_URLMAXSIZE*2];
  1678.                                 tempo4[0]='\0';
  1679.                                 
  1680.                                 if (strnotempty(tempo_pat)) {
  1681.                                   HT_ADD("codebase=\"");
  1682.                                   HT_ADD(tempo_pat);
  1683.                                   HT_ADD("\" ");
  1684.                                 }
  1685.                                 
  1686.                                 strncat(tempo4,lastsaved,(int) p_flush-(int) lastsaved);   // NOTE(review): same pointer-to-int casts; left untouched because p_flush/lastsaved are declared outside this view
  1687.                                 HT_ADD(tempo4);    // refresh code="
  1688.                               }
  1689.                             }
  1690.                             //lastsaved=adr;    // dernier Θcrit+1
  1691.                           }                              
  1692.                           
  1693.                           if ((opt.getmode & 1) && (ptr>0)) {
  1694.                             // Θcrire le lien modifiΘ, relatif
  1695.                             HT_ADD(tempo);
  1696.                           }
  1697.                           lastsaved=eadr-1;    // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein)
  1698.                         } else {
  1699.                           if (opt.errlog) {
  1700.                             fprintf(opt.errlog,"Error building relative link %s and %s"LF,save,savename);
  1701.                             test_flush;
  1702.                           }
  1703.                         }
  1704.                       }  // sinon le lien sera Θcrit normalement
  1705.                       
  1706.                       
  1707. #if 0
  1708.                       if (fexist(save)) {    // le fichier existe..
  1709.                         adr[0]='\0';
  1710.                         //if ((opt.debug>0) && (opt.log!=NULL)) {
  1711.                         if (opt.errlog) {
  1712.                           fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link has already been written on disk, cancelled: %s"LF,save);
  1713.                           test_flush;
  1714.                         }
  1715.                       }
  1716. #endif                            
  1717.                       
  1718.                       if ((adr[0]!='\0') && (p_type!=2) && (p_type!=-2) && ( (forbidden_url!=1) || (just_test_it))) {  // si le fichier n'existe pas, ajouter α la liste                            
  1719.                         // n'y a-t-il pas trop de liens?
  1720.                         if (lien_tot+1 >= lien_max-4) {    // trop de liens!
  1721.                           printf("PANIC! : Too many URLs : >%d [%d]\n",lien_tot,__LINE__);
  1722.                           if (opt.errlog) {
  1723.                             fprintf(opt.errlog,LF"Too many URLs, giving up..(>%d)"LF,lien_max);
  1724.                             fprintf(opt.errlog,"To avoid that: use #L option for more links (example: -#L1000000)"LF);
  1725.                             test_flush;
  1726.                           }
  1727.                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  1728.                           XH_uninit;   // dΘsallocation mΘmoire & buffers
  1729.                           return 0;
  1730.                           
  1731.                         } else {    // noter le lien sur la listes des liens α charger
  1732.                           int pass_fix,dejafait=0;
  1733.                           
  1734.                           // Calculer la prioritΘ de ce lien
  1735.                           if ((opt.getmode & 4)==0) {    // traiter html aprΦs
  1736.                             pass_fix=0;
  1737.                           } else {    // vΘrifier que ce n'est pas un !html
  1738.                             if (!ishtml(fil))
  1739.                               pass_fix=1;        // prioritΘ infΘrieure (traiter aprΦs)
  1740.                             else
  1741.                               pass_fix=max(0,numero_passe);    // prioritΘ normale
  1742.                           }
  1743.                           
  1744.                           // vΘrifier que le lien n'a pas dΘja ΘtΘ notΘ
  1745.                           // si c'est le cas, alors il faut s'assurer que la prioritΘ associΘe
  1746.                           // au fichier est la plus grande des deux prioritΘs
  1747.                           //
  1748.                           // On part de la fin et on essaye de se presser (Θconomise temps machine)
  1749. #if HTS_HASH
  1750.                           {
  1751.                             int i=hash_read(&hash,save,"",0);      // lecture type 0 (sav)
  1752.                             if (i>=0) {
  1753.                               liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth-1);
  1754.                               dejafait=1;
  1755.                             }
  1756.                           }
  1757. #else
  1758.                           {
  1759.                             register int l;
  1760.                             register int i;
  1761.                             l=strlen(save);  // opti
  1762.                             for(i=lien_tot-1;(i>=0) && (dejafait==0);i--) {
  1763.                               if (liens[i]->sav_len==l) {    // mΩme taille de chaεne
  1764.                                 if (strcmp(liens[i]->sav,save)==0) {    // existe dΘja
  1765.                                   liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth-1);
  1766.                                   dejafait=1;
  1767.                                 }
  1768.                               }
  1769.                             }
  1770.                           }
  1771. #endif
  1772.                           
  1773.                           // le lien n'a jamais été créé.
  1774.                           // cette fois ci, on le crée!
  1775.                           if (!dejafait) {                                
  1776.                             //
  1777.                             // >>>> CREER LE LIEN <<<<
  1778.                             //
  1779.                             // enregistrer lien à charger
  1780.                             //liens[lien_tot]->adr[0]=liens[lien_tot]->fil[0]=liens[lien_tot]->sav[0]='\0';
  1781.                             // même adresse: l'objet père est l'objet père de l'actuel
  1782.                             
  1783.                             // DEBUT ROBOTS.TXT AJOUT
  1784.                             if (!just_test_it) {
  1785.                               if (strfield(adr,"ftp://")==0) {    // non ftp
  1786.                                 if (opt.robots) {    // rΘcupΘrer robots
  1787.                                   if (ishtml(fil)!=0) {                       // pas la peine pour des fichiers isolΘs
  1788.                                     if (checkrobots(&robots,adr,"") != -1) {    // robots.txt ?
  1789.                                       checkrobots_set(&robots,adr,"");          // ajouter entrΘe vide
  1790.                                       if (checkrobots(&robots,adr,"") == -1) {    // robots.txt ?
  1791.                                         // enregistrer robots.txt (MACRO)
  1792.                                         liens_record(adr,"/robots.txt","","","");
  1793.                                         if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  1794.                                           printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  1795.                                           if (opt.errlog) { 
  1796.                                             fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(add_tab_alloc+1)*sizeof(lien_url));
  1797.                                             test_flush;
  1798.                                           }
  1799.                                           if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  1800.                                           XH_uninit;    // dΘsallocation mΘmoire & buffers
  1801.                                           return 0;
  1802.                                         }  
  1803.                                         liens[lien_tot]->testmode=0;          // pas mode test
  1804.                                         liens[lien_tot]->link_import=0;       // pas mode import     
  1805.                                         liens[lien_tot]->premier=lien_tot;
  1806.                                         liens[lien_tot]->precedent=ptr;
  1807.                                         liens[lien_tot]->depth=0;
  1808.                                         liens[lien_tot]->pass2=max(0,numero_passe);
  1809.                                         liens[lien_tot]->retry=0;
  1810.                                         lien_tot++;  // UN LIEN DE PLUS
  1811. #if DEBUG_ROBOTS
  1812.                                         printf("robots.txt: added file robots.txt for %s\n",adr);
  1813. #endif
  1814.                                         if ((opt.debug>1) && (opt.log!=NULL)) {
  1815.                                           fspc(opt.log,"debug"); fprintf(opt.log,"robots.txt added at %s"LF,adr);
  1816.                                           test_flush;
  1817.                                         }
  1818.                                       } else {
  1819.                                         if (opt.errlog) {   
  1820.                                           fprintf(opt.errlog,"Unexpected robots.txt error at %d"LF,__LINE__);
  1821.                                           test_flush;
  1822.                                         }
  1823.                                       }
  1824.                                     }
  1825.                                   }
  1826.                                 }
  1827.                               }
  1828.                             }
  1829.                             // FIN ROBOTS.TXT AJOUT
  1830.                             
  1831.                             // enregistrer (MACRO)
  1832.                             liens_record(adr,fil,save,former_adr,former_fil);
  1833.                             if (liens[lien_tot]==NULL) {  // erreur, pas de place rΘservΘe
  1834.                               printf("PANIC! : Not enough memory [%d]\n",__LINE__);
  1835.                               if (opt.errlog) { 
  1836.                                 fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(add_tab_alloc+1)*sizeof(lien_url));
  1837.                                 test_flush;
  1838.                               }
  1839.                               if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } }
  1840.                               XH_uninit;    // dΘsallocation mΘmoire & buffers
  1841.                               return 0;
  1842.                             }  
  1843.                             
  1844.                             // mode test?
  1845.                             if (!just_test_it)
  1846.                               liens[lien_tot]->testmode=0;          // pas mode test
  1847.                             else
  1848.                               liens[lien_tot]->testmode=1;          // mode test
  1849.                             if (!import_done)
  1850.                               liens[lien_tot]->link_import=0;       // pas mode import
  1851.                             else
  1852.                               liens[lien_tot]->link_import=1;       // mode import
  1853.                             // écrire autres paramètres de la structure-lien
  1854.                             if ((meme_adresse) && (!import_done) && (liens[ptr]->premier != 0))
  1855.                               liens[lien_tot]->premier=liens[ptr]->premier;
  1856.                             else    // sinon l'objet pΦre est le prΘcΘdent lui mΩme
  1857.                               liens[lien_tot]->premier=lien_tot;
  1858.                             // liens[lien_tot]->premier=ptr;
  1859.                             
  1860.                             liens[lien_tot]->precedent=ptr;
  1861.                             // noter la priorité
  1862.                             if (!set_prio_to)
  1863.                               liens[lien_tot]->depth=liens[ptr]->depth-1;
  1864.                             else
  1865.                               liens[lien_tot]->depth=max(0,min(liens[ptr]->depth-1,set_prio_to-1));         // PRIORITE NULLE (catch page)
  1866.                             // noter pass
  1867.                             liens[lien_tot]->pass2=pass_fix;
  1868.                             liens[lien_tot]->retry=opt.retry;
  1869.                             
  1870.                             //strcpy(liens[lien_tot]->adr,adr);
  1871.                             //strcpy(liens[lien_tot]->fil,fil);
  1872.                             //strcpy(liens[lien_tot]->sav,save); 
  1873.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  1874.                               if (!just_test_it) {
  1875.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, NOTE: %s%s -> %s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil,liens[lien_tot]->sav);
  1876.                               } else {
  1877.                                 fspc(opt.log,"debug"); fprintf(opt.log,"OK, TEST: %s%s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil);
  1878.                               }
  1879.                               test_flush;
  1880.                             }
  1881.                             
  1882.                             lien_tot++;  // UN LIEN DE PLUS
  1883.                           } else { // if !dejafait
  1884.                             if ((opt.debug>1) && (opt.log!=NULL)) {
  1885.                               fspc(opt.log,"debug"); fprintf(opt.log,"link has already been recorded, cancelled: %s"LF,save);
  1886.                               test_flush;
  1887.                             }
  1888.                             
  1889.                           }
  1890.                           
  1891.                           
  1892.                         }   // si pas trop de liens
  1893.                       }   // si adr[0]!='\0'
  1894.                       
  1895.                       
  1896.                     }  // if adr[0]!='\0' 
  1897.                     
  1898.                   }  // if adr[0]!='\0'
  1899.                   
  1900.                 }    // if strlen(lien)>0
  1901.                 
  1902.               }   // if ok==0      
  1903.               
  1904.               adr=eadr-1;  // ** sauter
  1905.               
  1906.             }  // if (p) 
  1907.             
  1908.           }  // si '<' ou '>'
  1909.           
  1910.           // plus loin
  1911.           adr++;  
  1912.           
  1913.           // ----------
  1914.           // écrire peu à peu
  1915.           if ((opt.getmode & 1) && (ptr>0)) HT_ADD_ADR;
  1916.           lastsaved=adr;    // dernier Θcrit+1
  1917.           // ----------
  1918.           
  1919.           // pour les stats du shell si parsing trop long
  1920. #if HTS_ANALYSTE==2
  1921.           _hts_in_html_done=(100 * ((int) adr - (int) r.adr) ) / (int)(r.size);
  1922.           if (_hts_in_html_poll) {
  1923.             LLint nb;
  1924.             int nbk;
  1925.             _hts_in_html_poll=0;
  1926.             // temps à attendre, et remplir autant que l'on peut le cache (backing)
  1927.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  1928.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  1929.  
  1930.             engine_stats();
  1931.             nb=back_transfered(HTS_STAT.stat_bytes,back,back_max);
  1932.             nbk=backlinks_done(liens,lien_tot,ptr);
  1933.             //if (!hts_htmlcheck_loop(back,back_max,-1,ptr,lien_tot,nb,new_stat_bytes,(int) (time_local()-stat_timestart),back_nsoc(back,back_max) )) {
  1934.             if (!hts_htmlcheck_loop(back,back_max,0,ptr,lien_tot,nb,new_stat_bytes,(int) (time_local()-HTS_STAT.stat_timestart),back_nsoc(back,back_max), HTS_STAT.stat_files,HTS_STAT.stat_updated_files,fspc(NULL,"error"),(int)HTS_STAT.rate,nbk )) {
  1935.               if (opt.errlog) {
  1936.                 fspc(opt.errlog,"info"); fprintf(opt.errlog,"Exit requested by shell or user"LF);
  1937.                 test_flush;
  1938.               } 
  1939.               exit_xh=1;  // exit requested
  1940.               XH_uninit;
  1941.               return 0;
  1942.               //adr = r.adr + r.size;  // exit
  1943.             } else if (_hts_cancel==1) {
  1944.               adr = r.adr + r.size;  // exit
  1945.               _hts_cancel=0;
  1946.             }
  1947.           }
  1948.  
  1949.           // refresh the backing system each 2 seconds
  1950.           if (engine_stats()) {
  1951.             back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart);        
  1952.             back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot);
  1953.           }
  1954. #endif
  1955.         } while(( ((int) adr) - ((int) r.adr) ) < r.size);
  1956. #if HTS_ANALYSTE==2
  1957.         _hts_in_html_parsing=0;  // flag
  1958.         _hts_cancel=0;           // pas de cancel
  1959. #endif
  1960.         if ((opt.getmode & 1) && (ptr>0)) {
  1961.           HT_ADD_END;    // achever
  1962.         }
  1963.         //
  1964.         //
  1965.         //
  1966.       }  // if !error
  1967.       
  1968.       
  1969.       if (opt.getmode & 1) { if (fp) { fclose(fp); fp=NULL; } }
  1970.       // sauver fichier
  1971.       //structcheck(savename);
  1972.       //filesave(r.adr,r.size,savename);
  1973.       
  1974. #if HTS_ANALYSTE
  1975.     }  // analyse OK
  1976. #endif
  1977.         
  1978.